Pipeline:

  1. Loading Data and Packages

  2. Quality control of Raw Counts

  3. Filtering and Normalization

  4. Quality control of data, post filering and normalization

  5. Set up design matrix and contrasts

  6. Estimate Dispersion and Fit the Model

  7. Perform Differential Expression Tests

  8. Visualization plots

  1. Pathway analysis
  1. Gene Set Enrichment Analysis

1) Loading Data and Packages

# Set working directory to project folder
setwd("~/Documents/Project/Salivary_glands_project")

# Load required libraries
library(edgeR)           # Differential expression analysis
library(ggplot2)         # Data visualization
library(RColorBrewer)    # Color palettes
library(reshape2)        # Data reshaping
library(pheatmap)        # Heatmap visualization
library(ggrepel)         # Enhanced text labels in ggplot2
library(kableExtra)      # Enhanced tables
library(clusterProfiler) # Functional profiling of gene clusters
library(org.Mm.eg.db)    # Mouse genome annotation
library(ReactomePA)      # Reactome pathway analysis
library(annotables)      # Gene annotations
library(tidyverse)       # Data manipulation and visualization
library(AnnotationDbi)   # Annotation interface
library(EnhancedVolcano) # Enhanced volcano plots
library(enrichplot)      # Enrichment visualization
library(ggforce)         # For bubble shapes around clusters
library(msigdbr)         # For MSigDB gene sets
library(ggridges)        # Ridge Plots
library(KEGGREST)        #Allows access to the KEGG REST API

# Set working directory
setwd("~/Documents/Project/Salivary_glands_project")  # Update this path as necessary

# Imported raw count data from featurecounts tool
Data<- read.csv("readCounts2.tsv",
                header = TRUE, 
                sep = '\t',
                row.names = 'Geneid',
                skip=1)

# View(Data)

# Dropped extra columns from featureCount output
drops <- c("Chr","Start","End","Strand","Length")
# head(drops)
Data <- Data[ , !(names(Data) %in% drops)]
# colnames(Data)

# Converted count data to matrix and ensure integer mode
countData <- as.matrix(Data)
mode(countData) <- "integer"

# View(countData)

# Loaded colData as metadata
metadata <- read.csv("info.csv", sep=",", row.names=1)

# Displayed first 50 rows of metadata
kable(head(metadata, 50), digits=5) %>% kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "55%", height = "500px")
Strain Glands Sex Sample.1
Par_1A_Mal CD1 Par Male Par_1A_Mal
Par_1B_Mal C57 Par Male Par_1B_Mal
Par_2A_Mal CD1 Par Male Par_2A_Mal
Par_2B_Mal C57 Par Male Par_2B_Mal
Par_3A_Mal CD1 Par Male Par_3A_Mal
Par_3B_Mal C57 Par Male Par_3B_Mal
Par_4A_Fem CD1 Par Female Par_4A_Fem
Par_4B_Fem C57 Par Female Par_4B_Fem
Par_5A_Fem CD1 Par Female Par_5A_Fem
Par_5B_Fem C57 Par Female Par_5B_Fem
Par_6A_Fem CD1 Par Female Par_6A_Fem
Par_6B_Fem C57 Par Female Par_6B_Fem
SL_1A_Mal CD1 SL Male SL_1A_Mal
SL_1B_Mal C57 SL Male SL_1B_Mal
SL_2A_Mal CD1 SL Male SL_2A_Mal
SL_2B_Mal C57 SL Male SL_2B_Mal
SL_3A_Mal CD1 SL Male SL_3A_Mal
SL_3B_Mal C57 SL Male SL_3B_Mal
SL_4A_Fem CD1 SL Female SL_4A_Fem
SL_4B_Fem C57 SL Female SL_4B_Fem
SL_5A_Fem CD1 SL Female SL_5A_Fem
SL_5B_Fem C57 SL Female SL_5B_Fem
SL_6A_Fem CD1 SL Female SL_6A_Fem
SL_6B_Fem C57 SL Female SL_6B_Fem
SM_1A_Mal CD1 SM Male SM_1A_Mal
SM_1B_Mal C57 SM Male SM_1B_Mal
SM_2A_Mal CD1 SM Male SM_2A_Mal
SM_2B_Mal C57 SM Male SM_2B_Mal
SM_3A_Mal CD1 SM Male SM_3A_Mal
SM_3B_Mal C57 SM Male SM_3B_Mal
SM_4A_Fem CD1 SM Female SM_4A_Fem
SM_4B_Fem C57 SM Female SM_4B_Fem
SM_5A_Fem CD1 SM Female SM_5A_Fem
SM_5B_Fem C57 SM Female SM_5B_Fem
SM_6A_Fem CD1 SM Female SM_6A_Fem
SM_6B_Fem C57 SM Female SM_6B_Fem
# Checked all sample IDs in colData are also in CountData 
all(rownames(metadata) %in% colnames(countData))
## [1] TRUE
# Checking and matching the order of countData and metadata
countData <- countData[, rownames(metadata)]
all(rownames(metadata) == colnames(countData))
## [1] TRUE

2) Quality Control: Raw Counts

# Created a DGEList object with raw counts for downstream analysis and quality control
raw_counts <- DGEList(counts = countData)
  1. Inspecting Library Sizes
# Calculated and summarized library sizes
library_sizes <- colSums(raw_counts$counts)
summary(library_sizes)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 30179848 36029489 39188736 40403424 42433542 65308671
# Created a barplot to vizualize library sizes
ggplot(data = data.frame(Sample = names(library_sizes), Counts = library_sizes),
       aes(x = Sample, y = Counts)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Library Sizes", x = "Sample", y = "Total Counts")

  1. Distribution of Raw Counts
# Calculated log2 CPM values from raw counts
log_cpm <- cpm(raw_counts, log = TRUE, prior.count = 1)

# Prepared data for boxplot viz.
log_cpm_long <- melt(as.data.frame(log_cpm), id.vars = NULL, variable.name = "Sample", value.name = "Log2_CPM")
log_cpm_long$Strain <- metadata$Strain[match(log_cpm_long$Sample, rownames(metadata))]

# Plotted boxplots of gene expression distributions
ggplot(log_cpm_long, aes(x = Sample, y = Log2_CPM, fill = Strain)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Gene Expression Distribution (Raw Counts)", x = "Sample", y = "Log2(CPM + 1)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  guides(fill = guide_legend(title = "Strain"))

  1. Pearson correlation
# Calculated Pearson correlation matrix
cor_matrix <- cor(log_cpm, method = "pearson")

# Defined a color palette for heatmap
color_palette <- colorRampPalette(brewer.pal(9, "RdYlBu"))(50)

# Prepared annotation for heatmap
annotation_col <- data.frame(
  Strain = metadata$Strain,
  Glands = metadata$Glands,
  Sex = metadata$Sex
)
rownames(annotation_col) <- rownames(metadata)

# Generated correlation heatmap with annotations
pheatmap(cor_matrix, 
         main = "Pearson Correlation Heatmap (Raw Counts)", 
         color = color_palette,
         clustering_distance_rows = "correlation", 
         clustering_distance_cols = "correlation", 
         border_color = NA,
         display_numbers = FALSE,
         fontsize_row = 8,
         fontsize_col = 9,
         annotation_col = annotation_col,
         fontsize = 10)

4. Euclidean Distance

# Calculated Euclidean distance matrix
dist_matrix <- dist(t(log_cpm), method = "euclidean")
dist_matrix_full <- as.matrix(dist_matrix)

# Plotted heatmap with annotations, used the same color palette as the heatmap for Pearson correlation
pheatmap(dist_matrix_full, 
         main = "Euclidean Distance Heatmap (Raw Counts)", 
         color = color_palette,
         border_color = NA,
         display_numbers = FALSE,
         fontsize_row = 8,
         fontsize_col = 9,
         annotation_col = annotation_col,
         fontsize = 10)

  1. Principal Component Analysis
# Defined color palette for Glands and Sex
gland_colors <- c("Par" = "cyan2", "SL" = "seagreen1", "SM" = "lightcoral") # Assign colors to each gland type
sex_colors <- c("Male" = "purple", "Female" = "red") # Colors for Sex

# Performed PCA
pca <- prcomp(t(log_cpm), scale. = TRUE)

# Extracted PCA data and calculated variance explained
pca_data <- as.data.frame(pca$x)
percentVar <- round(100 * (pca$sdev^2 / sum(pca$sdev^2)), 1)

# Added sample metadata to PCA results
pca_data$SampleID <- rownames(pca_data)
pca_data$Strain <- metadata$Strain[match(pca_data$SampleID, rownames(metadata))]
pca_data$Glands <- metadata$Glands[match(pca_data$SampleID, rownames(metadata))]
pca_data$Sex <- metadata$Sex[match(pca_data$SampleID, rownames(metadata))]

# Plot PCA
ggplot(pca_data, aes(x = PC1, y = PC2, color = Glands, shape = Glands)) +
  geom_point(size = 3) +
#  geom_text_repel(size = 3, fontface = "bold", max.overlaps = 10, box.padding = 0.3, point.padding = 0.5) +
  scale_color_manual(values = gland_colors) +
  scale_shape_manual(values = c("Par" = 16, "SL" = 17, "SM" = 15)) + # Different shapes for each gland type
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(title = "PCA Plot of Gene Expression Data (Raw Counts)") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12, face = "bold"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.position = "right"
  ) +
  guides(color = guide_legend(title = "Glands"),
         shape = guide_legend(title = "Glands")) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  geom_mark_ellipse(aes(fill = Sex), alpha = 0.1, color = NA) + # Ellipses filled based on Sex
  facet_wrap(~ Strain) + # Facet by Strain
  scale_fill_manual(values = sex_colors) + # Custom fill colors for ellipses based on Sex
  theme(panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

3) Filtering and Normalization

# Created group factor based on gland types
group <- factor(metadata$Glands)

# Created DGEList object with group information
y <- DGEList(counts = countData, group = group)

# Filtered lowly expressed genes
keep <- filterByExpr(y)
y <- y[keep, , keep.lib.sizes = FALSE]

# Normalized the data using TMM method
y <- calcNormFactors(y)

# Displayed the first 50 rows of the normalized sample information
kable(head(y$samples, 50), digits=5) %>% kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "50%", height = "500px")
group lib.size norm.factors
Par_1A_Mal Par 49346283 1.04225
Par_1B_Mal Par 41958714 0.95690
Par_2A_Mal Par 39588858 0.71979
Par_2B_Mal Par 38521344 0.64772
Par_3A_Mal Par 65280666 0.77639
Par_3B_Mal Par 38812928 0.83174
Par_4A_Fem Par 50136011 0.56364
Par_4B_Fem Par 40097846 0.56423
Par_5A_Fem Par 39029949 1.38403
Par_5B_Fem Par 36458387 1.01408
Par_6A_Fem Par 42659603 0.92477
Par_6B_Fem Par 39232788 0.52092
SL_1A_Mal SL 48441001 1.46385
SL_1B_Mal SL 40594504 1.23048
SL_2A_Mal SL 43189949 1.59504
SL_2B_Mal SL 44423292 1.24701
SL_3A_Mal SL 33139664 1.51100
SL_3B_Mal SL 38365403 1.31636
SL_4A_Fem SL 46153401 1.48888
SL_4B_Fem SL 38555200 1.40968
SL_5A_Fem SL 32477171 1.46581
SL_5B_Fem SL 42336706 1.52588
SL_6A_Fem SL 41094426 1.45802
SL_6B_Fem SL 33817200 1.32851
SM_1A_Mal SM 30173355 0.62333
SM_1B_Mal SM 33349955 0.95360
SM_2A_Mal SM 41966995 0.86098
SM_2B_Mal SM 39124401 0.94713
SM_3A_Mal SM 34166773 1.00972
SM_3B_Mal SM 36254282 1.07864
SM_4A_Fem SM 32913394 0.89073
SM_4B_Fem SM 54266434 0.72644
SM_5A_Fem SM 36841471 1.03019
SM_5B_Fem SM 35307553 0.94457
SM_6A_Fem SM 34370981 0.82847
SM_6B_Fem SM 41257935 0.86801
# Calculated log2 CPM values after normalization
log_cpm_norm <- cpm(y, log=TRUE, prior.count=1)

4) Quality Control: Post normalization

  1. Distribution of Normalized Counts
# Prepared the data for visualization
log_cpm_norm_long <- melt(as.data.frame(log_cpm_norm), id.vars = NULL, variable.name = "Sample", value.name = "Log2_CPM")
log_cpm_norm_long$Strain <- metadata$Strain[match(log_cpm_norm_long$Sample, rownames(metadata))]

# Plotted boxplots of gene expression distributions
ggplot(log_cpm_norm_long, aes(x = Sample, y = Log2_CPM, fill = Strain)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Gene Expression Distribution (Raw Counts)", x = "Sample", y = "Log2(CPM + 1)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  guides(fill = guide_legend(title = "Strain"))

  1. Pearson Correlation
# Calculated Pearson correlation matrix for normalized data
cor_matrix_norm <- cor(log_cpm_norm, method = "pearson")

# Generated heatmap to vizualize sample correlation using pearson method
pheatmap(cor_matrix_norm, 
         main = "Pearson Correlation Heatmap (Normalized Counts)", 
         color = color_palette,
         clustering_distance_rows = "correlation", 
         clustering_distance_cols = "correlation", 
         border_color = NA,
         display_numbers = FALSE,
         fontsize_row = 8,
         fontsize_col = 9,
         annotation_col = annotation_col,  # Strain, Glands, Sex
         fontsize = 10)

  1. Euclidean Distance
# Calculated Euclidean distance matrix
dist_matrix_norm <- dist(t(log_cpm_norm), method = "euclidean")
dist_matrix_full_norm <- as.matrix(dist_matrix_norm)

# Plot heatmap with annotations, used same color palette as pearson heatmap
pheatmap(dist_matrix_full_norm, 
         main = "Euclidean Distance Heatmap (Normalized Counts)", 
         color = color_palette,
         border_color = NA,
         display_numbers = FALSE,
         fontsize_row = 8,
         fontsize_col = 9,
         annotation_col = annotation_col,  # Strain, Glands, Sex
         fontsize = 10)

  1. PCA Plot
# Define color palette for Glands and Sex
strain_colors <- c("Par" = "cyan2", "SL" = "seagreen1", "SM" = "lightcoral") # Assign colors to each gland type
sex_colors <- c("Male" = "purple", "Female" = "red") # Colors for Sex

# Performed PCA
pca_norm <- prcomp(t(log_cpm_norm), scale. = TRUE)

# Extracted PCA data and calculate variance explained
pca_data_norm <- as.data.frame(pca_norm$x)
percentVar_norm <- round(100 * (pca_norm$sdev^2 / sum(pca_norm$sdev^2)), 1)

# Added sample metadata to PCA results
pca_data_norm$SampleID <- rownames(pca_data_norm)
pca_data_norm$Strain <- metadata$Strain[match(pca_data_norm$SampleID, metadata$Sample)]
pca_data_norm$Glands <- metadata$Glands[match(pca_data_norm$SampleID, metadata$Sample)]
pca_data_norm$Sex <- metadata$Sex[match(pca_data_norm$SampleID, metadata$Sample)]

# Generated PCA plot and vizualized it using ggplot2
ggplot(pca_data_norm, aes(x = PC1, y = PC2, color = Glands, shape = Glands)) +
  geom_point(size = 3) +
#  geom_text_repel(size = 3, fontface = "bold", max.overlaps = 10, box.padding = 0.3, point.padding = 0.5) +
  scale_color_manual(values = strain_colors) +
  xlab(paste0("PC1: ", percentVar_norm[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar_norm[2], "% variance")) +
  labs(title = "PCA Plot of Gene Expression Data (Normalized Counts)") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12, face = "bold"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.position = "right"
  ) +
  guides(color = guide_legend(title = "Glands"),
         shape = guide_legend(title = "Glands")) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  geom_mark_ellipse(aes(fill = Sex), alpha = 0.1, color = NA) + 
  facet_wrap(~ Strain) + # Facet by Strain
  scale_fill_manual(values = sex_colors) + 
  theme(panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

4) Set Up the Design Matrix and Contrasts

# Converted the Glands column in metadata to a factor
metadata$Glands <- factor(metadata$Glands)

# Viewed the levels of Glands
levels(metadata$Glands)
## [1] "Par" "SL"  "SM"
# Set up the design matrix without an intercept (for pairwise comparisons)
design <- model.matrix(~ 0 + Glands, data = metadata)
colnames(design) <- levels(metadata$Glands)  # Name columns as Glands

# Displayed the first 60 rows of the design matrix
kable(head(design, 55), digits=5) %>% kable_styling("striped", full_width = FALSE) %>% scroll_box(width = "32%", height = "500px")
Par SL SM
Par_1A_Mal 1 0 0
Par_1B_Mal 1 0 0
Par_2A_Mal 1 0 0
Par_2B_Mal 1 0 0
Par_3A_Mal 1 0 0
Par_3B_Mal 1 0 0
Par_4A_Fem 1 0 0
Par_4B_Fem 1 0 0
Par_5A_Fem 1 0 0
Par_5B_Fem 1 0 0
Par_6A_Fem 1 0 0
Par_6B_Fem 1 0 0
SL_1A_Mal 0 1 0
SL_1B_Mal 0 1 0
SL_2A_Mal 0 1 0
SL_2B_Mal 0 1 0
SL_3A_Mal 0 1 0
SL_3B_Mal 0 1 0
SL_4A_Fem 0 1 0
SL_4B_Fem 0 1 0
SL_5A_Fem 0 1 0
SL_5B_Fem 0 1 0
SL_6A_Fem 0 1 0
SL_6B_Fem 0 1 0
SM_1A_Mal 0 0 1
SM_1B_Mal 0 0 1
SM_2A_Mal 0 0 1
SM_2B_Mal 0 0 1
SM_3A_Mal 0 0 1
SM_3B_Mal 0 0 1
SM_4A_Fem 0 0 1
SM_4B_Fem 0 0 1
SM_5A_Fem 0 0 1
SM_5B_Fem 0 0 1
SM_6A_Fem 0 0 1
SM_6B_Fem 0 0 1

5) Estimate Dispersion and Fit the Model

# Assigned the design matrix to the DGEList object and estimated dispersion
y <- estimateDisp(y, design)

# Plotted BCV (Biological Coefficient of Variation) to assess dispersion estimates
plotBCV(y)

# Fitted the generalized linear model using the quasi-likelihood method
fit_qlf <- glmQLFit(y, design)

Define Contrasts for Pairwise Comparisons

# Defined the contrasts 
contrast_matrix <- makeContrasts(
  Par_vs_SL = Par - SL,
  Par_vs_SM = Par - SM,
  SL_vs_SM  = SL - SM,
  levels = design
)

# View the contrast matrix
head(contrast_matrix)
##       Contrasts
## Levels Par_vs_SL Par_vs_SM SL_vs_SM
##    Par         1         1        0
##    SL         -1         0        1
##    SM          0        -1       -1

7) Performing Differential Expression Tests

Using the contrasts to perform the quasi-likelihood F-tests for each comparison. Function “glmQLFTest” performs quasi-likelihood F-test for each contrast

# quasi-likelihood F-test were performed for each contrast individually

# Contrast 1: Par vs SL
qlf_Par_vs_SL <- glmQLFTest(fit_qlf, contrast = contrast_matrix[,"Par_vs_SL"])

# Contrast 2: Par vs SM
qlf_Par_vs_SM <- glmQLFTest(fit_qlf, contrast = contrast_matrix[,"Par_vs_SM"])

# Contrast 3: SL vs SM
qlf_SL_vs_SM <- glmQLFTest(fit_qlf, contrast = contrast_matrix[,"SL_vs_SM"])

Identify Upregulated and Downregulated Genes for Each Contrast

topTags function in edgeR extracts and organizes differentially expressed genes from statistical test results.

# Number of genes to display/save was set (n=Inf to include all genes)
n_genes <- Inf

# Par vs SL
topTags_Par_vs_SL <- topTags(qlf_Par_vs_SL, n = n_genes) #A variable topTags_Par_vs_SL is created where the        extracted top genes are stored. 

results_Par_vs_SL <- topTags_Par_vs_SL$table #results_Par_vs_SL variable holds table from topTags object
results_Par_vs_SL$Gene_names <- rownames(results_Par_vs_SL) #Rownames that contains gene names now have their own column called gene names. This column will help during vizualization.


# Par vs SM
topTags_Par_vs_SM <- topTags(qlf_Par_vs_SM, n = n_genes)
results_Par_vs_SM <- topTags_Par_vs_SM$table
results_Par_vs_SM$Gene_names <- rownames(results_Par_vs_SM)


# SL vs SM
topTags_SL_vs_SM <- topTags(qlf_SL_vs_SM, n = n_genes)
results_SL_vs_SM <- topTags_SL_vs_SM$table
results_SL_vs_SM$Gene_names <- rownames(results_SL_vs_SM)

FDR of 5% is examined with the function “decideTests”

# Par vs SL
kable(head(results_Par_vs_SL, 20), digits=50, caption = "Top 20 DE Genes: Par vs SL") %>%
  kable_styling("striped", full_width = FALSE, position = "center")  %>% scroll_box(width = "80%", height = "500px")
Top 20 DE Genes: Par vs SL
logFC logCPM F PValue FDR Gene_names
Tmco3 -4.155307 7.0623389 919.5926 2.216414e-27 2.202153e-23 Tmco3
Cldn24 7.144283 1.4461368 384.7686 4.570061e-27 2.202153e-23 Cldn24
Nkx2-3 -6.701885 5.4486211 885.1794 5.173102e-27 2.202153e-23 Nkx2-3
Sox2 -6.312474 4.9650560 879.0124 5.741875e-27 2.202153e-23 Sox2
Galnt4 -3.587133 6.5218741 798.5773 2.594106e-26 7.959235e-23 Galnt4
Casc4 -4.808568 5.8041599 777.0299 4.198472e-26 1.073479e-22 Casc4
Isl2 -6.851417 3.1653075 711.9135 1.519104e-25 3.329225e-22 Isl2
Hpse2 -6.216574 2.0492147 572.3787 1.949171e-25 3.651888e-22 Hpse2
Liph -5.637708 5.9470991 707.4812 2.142428e-25 3.651888e-22 Liph
Bace2 -3.638722 5.9502284 684.0296 3.793755e-25 5.337132e-22 Bace2
Syt7 -5.785481 6.2686885 679.6713 4.311307e-25 5.337132e-22 Syt7
St3gal6 6.021131 6.0325314 678.8974 4.346841e-25 5.337132e-22 St3gal6
Tgoln1 -3.065223 7.5018066 677.0345 4.522698e-25 5.337132e-22 Tgoln1
Gcc2 -2.140715 5.5046809 663.5392 6.400906e-25 7.014022e-22 Gcc2
Vwf -3.636105 6.3349891 655.6737 7.867471e-25 8.046324e-22 Vwf
Ptprt -5.990776 1.8541562 550.9769 1.126969e-24 1.080552e-21 Ptprt
Heatr5a -2.458494 5.7820960 619.3447 2.094741e-24 1.890319e-21 Heatr5a
Slc44a4 -5.322553 6.4317854 614.5393 2.400135e-24 2.045582e-21 Slc44a4
A530006G24Rik 7.173198 0.5372368 231.9692 7.903342e-24 6.381325e-21 A530006G24Rik
Bcan -6.412518 2.8853652 564.1640 8.788216e-24 6.741001e-21 Bcan
# This summary was based on significance determined by FDR 
summary(decideTests(results_Par_vs_SL$FDR))
##        [,1]
## NotSig 6853
## Sig    8488
# Par vs SM
kable(head(results_Par_vs_SM, 20), digits=50, caption = "Top 20 DE Genes: Par vs SM") %>%
  kable_styling("striped", full_width = FALSE, position = "center")  %>% scroll_box(width = "80%", height = "500px")
Top 20 DE Genes: Par vs SM
logFC logCPM F PValue FDR Gene_names
Cldn24 7.387387 1.4461368 349.1460 5.082161e-26 7.796543e-22 Cldn24
St3gal6 5.411942 6.0325314 579.1408 6.664753e-24 5.112199e-20 St3gal6
Galnt6 4.461964 5.4128654 525.4032 3.517193e-23 1.798575e-19 Galnt6
Tlx1 -5.899304 4.9885551 520.1044 4.720355e-23 1.810374e-19 Tlx1
Ggh 5.387791 10.5375660 450.5100 4.700715e-22 1.442273e-18 Ggh
Cldn22 6.680395 6.5127189 441.7255 6.678743e-22 1.707643e-18 Cldn22
C330024C12Rik 6.659246 0.6463565 207.3046 8.483972e-22 1.859323e-18 C330024C12Rik
A530006G24Rik 6.702542 0.5372368 189.9868 1.479490e-21 2.837107e-18 A530006G24Rik
Csn3 6.647739 3.5915714 410.6277 3.374339e-21 5.751748e-18 Csn3
Prb1 7.087445 6.4659955 363.1883 1.779018e-20 2.729191e-17 Prb1
Liph -3.668593 5.9470991 356.7811 2.320996e-20 3.236945e-17 Liph
Shisa2 4.739191 4.4588110 354.7569 2.570511e-20 3.286184e-17 Shisa2
B4galnt3 5.972241 4.2350210 351.8476 3.229659e-20 3.811246e-17 B4galnt3
Klhl14 5.578961 -0.4543305 160.5617 4.888843e-20 5.357124e-17 Klhl14
Bpifb4 6.108430 0.5093333 220.6385 1.039515e-19 1.052623e-16 Bpifb4
Gm8882 6.732739 2.6042303 308.8407 1.097840e-19 1.052623e-16 Gm8882
Gm9994 6.947403 2.1213282 234.1878 2.274457e-19 2.052497e-16 Gm9994
1700066N21Rik 6.913182 9.6857686 308.2725 2.529098e-19 2.155494e-16 1700066N21Rik
4930426D05Rik 5.755185 0.2724439 159.0632 3.417759e-19 2.759571e-16 4930426D05Rik
Cst10 7.015809 11.7977784 300.9220 3.744712e-19 2.872381e-16 Cst10
# This summary was based on significance determined by FDR 
summary(decideTests(results_Par_vs_SM$FDR))
##        [,1]
## NotSig 8851
## Sig    6490
# SL vs SM
kable(head(results_SL_vs_SM, 20), digits=50, caption = "Top 20 DE Genes: SL vs SM") %>%
  kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "80%", height = "500px")
Top 20 DE Genes: SL vs SM
logFC logCPM F PValue FDR Gene_names
Galnt4 4.157584 6.521874 1015.6554 3.893774e-28 5.973438e-24 Galnt4
Nkx2-3 6.850034 5.448621 910.6768 3.157096e-27 2.421650e-23 Nkx2-3
Casc4 5.082725 5.804160 843.9180 9.989003e-27 4.991333e-23 Casc4
Sox2 6.084940 4.965056 838.5557 1.301436e-26 4.991333e-23 Sox2
Vwf 4.118477 6.334989 803.0749 2.353816e-26 7.221979e-23 Vwf
Bace2 3.945412 5.950228 780.6508 3.852107e-26 9.849197e-23 Bace2
Syt7 6.328771 6.268689 770.2607 4.951868e-26 1.085237e-22 Syt7
Hpse2 6.418600 2.049215 588.9173 1.138432e-25 2.057243e-22 Hpse2
Galnt12 5.242333 6.494604 730.9068 1.206909e-25 2.057243e-22 Galnt12
Tmco3 3.554465 7.062339 712.4616 1.874623e-25 2.875859e-22 Tmco3
Isl2 6.382053 3.165308 659.0448 5.797170e-25 8.084945e-22 Isl2
Fgl2 4.696106 6.467270 652.5791 8.537817e-25 1.026436e-21 Fgl2
Dnajc10 4.780718 8.340781 651.8237 8.698039e-25 1.026436e-21 Dnajc10
Tgoln1 2.822574 7.501807 586.2337 5.372613e-24 5.887233e-21 Tgoln1
Bcan 6.328490 2.885365 556.0463 1.126810e-23 1.152426e-20 Bcan
Foxp2 5.290392 2.163991 559.4940 1.582571e-23 1.517389e-20 Foxp2
Tmc5 6.253124 6.156531 531.2174 2.980537e-23 2.689672e-20 Tmc5
B3gnt6 6.822907 7.325964 510.0197 5.822527e-23 4.962411e-20 B3gnt6
Kcne1 5.814247 6.287985 488.3988 1.222004e-22 9.866717e-20 Kcne1
Galnt6 4.243983 5.412865 484.9055 1.370746e-22 1.051430e-19 Galnt6
# This summary was based on significance determined by FDR 
summary(decideTests(results_SL_vs_SM$FDR))
##        [,1]
## NotSig 9987
## Sig    5354

Identifying upregulated and downregulated genes

  1. Par vs SL
# Upregulated genes were identified following the filtering parameters of logFC ≥ 1 and FDR < 0.05
upregulated_Par_vs_SL <- results_Par_vs_SL[
  results_Par_vs_SL$FDR < 0.05 & results_Par_vs_SL$logFC >= 1, ]

# Downregulated genes were identified following the filtering parameters of logFC ≤ 1 and FDR < 0.05
downregulated_Par_vs_SL <- results_Par_vs_SL[
  results_Par_vs_SL$FDR < 0.05 & results_Par_vs_SL$logFC <= -1, ]

# Top 20 upregulated genes were displayed

kable(head(upregulated_Par_vs_SL, 20), digits = 50, caption = "Top 20 Upregulated Genes: Par vs SL") %>%
  kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "80%", height = "500px")
Top 20 Upregulated Genes: Par vs SL
logFC logCPM F PValue FDR Gene_names
Cldn24 7.144283 1.4461368 384.7686 4.570061e-27 2.202153e-23 Cldn24
St3gal6 6.021131 6.0325314 678.8974 4.346841e-25 5.337132e-22 St3gal6
A530006G24Rik 7.173198 0.5372368 231.9692 7.903342e-24 6.381325e-21 A530006G24Rik
C330024C12Rik 6.963839 0.6463565 247.1584 1.017724e-23 7.434717e-21 C330024C12Rik
Foxred2 4.115343 4.0718214 554.9747 1.379563e-23 8.818285e-21 Foxred2
Klhl14 6.518920 -0.4543305 215.4948 2.226527e-23 1.265080e-20 Klhl14
4930426D05Rik 7.631623 0.2724439 225.7695 6.506348e-23 3.441858e-20 4930426D05Rik
Ggh 5.765634 10.5375660 497.8351 8.698413e-23 4.170074e-20 Ggh
Cldn22 7.185898 6.5127189 490.9728 1.123886e-22 5.071039e-20 Cldn22
Csn3 7.458079 3.5915714 488.2706 1.903241e-22 8.342177e-20 Csn3
Prb1 7.698074 6.4659955 407.9935 2.578007e-21 9.197489e-19 Prb1
Gm9994 7.556486 2.1213282 276.8395 9.385634e-21 2.879700e-18 Gm9994
Crabp2 6.524720 3.2424634 377.3742 1.323304e-20 3.798276e-18 Crabp2
Man2a2 2.662444 5.2737581 368.2397 1.363867e-20 3.804198e-18 Man2a2
Gm8882 6.750530 2.6042303 331.3574 3.360730e-20 8.055774e-18 Gm8882
3110099E03Rik 7.270441 3.8313719 350.9954 4.196144e-20 9.903546e-18 3110099E03Rik
Susd3 3.282256 2.5715994 334.8271 6.783510e-20 1.553221e-17 Susd3
1700066N21Rik 7.160861 9.6857686 323.7933 1.134104e-19 2.485469e-17 1700066N21Rik
Slc25a21 3.521792 4.0678813 318.8254 1.466527e-19 3.124722e-17 Slc25a21
Svs6 5.563076 -1.1507408 163.5616 2.202092e-19 4.387311e-17 Svs6
# Top 20 downregulated genes were displayed

kable(head(downregulated_Par_vs_SL, 20), digits = 50, caption = "Top 20 Downregulated Genes: Par vs SL") %>%
  kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "80%", height = "500px")
Top 20 Downregulated Genes: Par vs SL
logFC logCPM F PValue FDR Gene_names
Tmco3 -4.155307 7.062339 919.5926 2.216414e-27 2.202153e-23 Tmco3
Nkx2-3 -6.701885 5.448621 885.1794 5.173102e-27 2.202153e-23 Nkx2-3
Sox2 -6.312474 4.965056 879.0124 5.741875e-27 2.202153e-23 Sox2
Galnt4 -3.587133 6.521874 798.5773 2.594106e-26 7.959235e-23 Galnt4
Casc4 -4.808568 5.804160 777.0299 4.198472e-26 1.073479e-22 Casc4
Isl2 -6.851417 3.165308 711.9135 1.519104e-25 3.329225e-22 Isl2
Hpse2 -6.216574 2.049215 572.3787 1.949171e-25 3.651888e-22 Hpse2
Liph -5.637708 5.947099 707.4812 2.142428e-25 3.651888e-22 Liph
Bace2 -3.638722 5.950228 684.0296 3.793755e-25 5.337132e-22 Bace2
Syt7 -5.785481 6.268689 679.6713 4.311307e-25 5.337132e-22 Syt7
Tgoln1 -3.065223 7.501807 677.0345 4.522698e-25 5.337132e-22 Tgoln1
Gcc2 -2.140715 5.504681 663.5392 6.400906e-25 7.014022e-22 Gcc2
Vwf -3.636105 6.334989 655.6737 7.867471e-25 8.046324e-22 Vwf
Ptprt -5.990776 1.854156 550.9769 1.126969e-24 1.080552e-21 Ptprt
Heatr5a -2.458494 5.782096 619.3447 2.094741e-24 1.890319e-21 Heatr5a
Slc44a4 -5.322553 6.431785 614.5393 2.400135e-24 2.045582e-21 Slc44a4
Bcan -6.412518 2.885365 564.1640 8.788216e-24 6.741001e-21 Bcan
Tmc5 -6.528392 6.156531 563.0624 1.105612e-23 7.566516e-21 Tmc5
Sec23b -3.519258 8.572684 561.1589 1.134410e-23 7.566516e-21 Sec23b
Dnajc10 -4.294980 8.340781 551.4164 1.529635e-23 9.386453e-21 Dnajc10
  1. Par vs SM
# Upregulated genes were identified following the filtering parameters of logFC ≥ 1 and FDR < 0.05
upregulated_Par_vs_SM <- results_Par_vs_SM[
  results_Par_vs_SM$FDR < 0.05 & results_Par_vs_SM$logFC >= 1, ]

# Downregulated genes were identified following the filtering parameters of logFC ≤ 1 and FDR < 0.05
downregulated_Par_vs_SM <- results_Par_vs_SM[
  results_Par_vs_SM$FDR < 0.05 & results_Par_vs_SM$logFC <= -1, ]

# Top 20 upregulated genes were displayed

kable(head(upregulated_Par_vs_SM, 20), digits = 50, caption = "Top 20 Upregulated Genes: Par vs SM") %>%
  kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "80%", height = "500px")
Top 20 Upregulated Genes: Par vs SM
logFC logCPM F PValue FDR Gene_names
Cldn24 7.387387 1.4461368 349.1460 5.082161e-26 7.796543e-22 Cldn24
St3gal6 5.411942 6.0325314 579.1408 6.664753e-24 5.112199e-20 St3gal6
Galnt6 4.461964 5.4128654 525.4032 3.517193e-23 1.798575e-19 Galnt6
Ggh 5.387791 10.5375660 450.5100 4.700715e-22 1.442273e-18 Ggh
Cldn22 6.680395 6.5127189 441.7255 6.678743e-22 1.707643e-18 Cldn22
C330024C12Rik 6.659246 0.6463565 207.3046 8.483972e-22 1.859323e-18 C330024C12Rik
A530006G24Rik 6.702542 0.5372368 189.9868 1.479490e-21 2.837107e-18 A530006G24Rik
Csn3 6.647739 3.5915714 410.6277 3.374339e-21 5.751748e-18 Csn3
Prb1 7.087445 6.4659955 363.1883 1.779018e-20 2.729191e-17 Prb1
Shisa2 4.739191 4.4588110 354.7569 2.570511e-20 3.286184e-17 Shisa2
B4galnt3 5.972241 4.2350210 351.8476 3.229659e-20 3.811246e-17 B4galnt3
Klhl14 5.578961 -0.4543305 160.5617 4.888843e-20 5.357124e-17 Klhl14
Bpifb4 6.108430 0.5093333 220.6385 1.039515e-19 1.052623e-16 Bpifb4
Gm8882 6.732739 2.6042303 308.8407 1.097840e-19 1.052623e-16 Gm8882
Gm9994 6.947403 2.1213282 234.1878 2.274457e-19 2.052497e-16 Gm9994
1700066N21Rik 6.913182 9.6857686 308.2725 2.529098e-19 2.155494e-16 1700066N21Rik
4930426D05Rik 5.755185 0.2724439 159.0632 3.417759e-19 2.759571e-16 4930426D05Rik
Cst10 7.015809 11.7977784 300.9220 3.744712e-19 2.872381e-16 Cst10
Foxred2 2.849927 4.0718214 298.4485 4.301921e-19 3.142656e-16 Foxred2
Dcpp1 5.834276 12.7182491 290.0334 6.806902e-19 4.351029e-16 Dcpp1
# Top 20 downregulated genes were displayed

kable(head(downregulated_Par_vs_SM, 20), digits = 50, caption = "Top 20 Downregulated Genes: Par vs SM") %>%
  kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "80%", height = "500px")
Top 20 Downregulated Genes: Par vs SM
logFC logCPM F PValue FDR Gene_names
Tlx1 -5.899304 4.9885551 520.1044 4.720355e-23 1.810374e-19 Tlx1
Liph -3.668593 5.9470991 356.7811 2.320996e-20 3.236945e-17 Liph
Ptprt -4.100384 1.8541562 264.3664 5.238903e-19 3.653182e-16 Ptprt
Gpr155 -5.056716 7.1810574 290.7018 6.563266e-19 4.351029e-16 Gpr155
Duoxa2 -6.383686 3.6682549 288.9538 7.995886e-19 4.732304e-16 Duoxa2
Duox2 -6.184777 4.5472542 263.2697 3.628761e-18 1.590538e-15 Duox2
Kcnj3 -5.479426 -0.6783567 166.5889 4.667624e-18 1.935298e-15 Kcnj3
Dstyk -1.570262 4.8940996 244.7219 1.036362e-17 4.076622e-15 Dstyk
Mal2 -3.007438 4.5782783 230.2453 2.719818e-17 9.482892e-15 Mal2
Vps33b -1.239887 4.5192099 229.4735 2.864251e-17 9.764550e-15 Vps33b
Crim1 -1.725672 5.6549675 226.6257 3.484749e-17 1.162164e-14 Crim1
Fmn1 -2.657240 4.7880313 213.4424 8.906618e-17 2.829331e-14 Fmn1
Ccdc68 -2.861194 3.2773613 213.4754 9.037040e-17 2.829331e-14 Ccdc68
Ppm1l -2.253322 3.6660778 208.3366 1.301410e-16 3.914692e-14 Ppm1l
Pcp4l1 -3.479140 4.9022820 207.4385 1.388697e-16 4.096923e-14 Pcp4l1
Arhgef10 -1.913454 6.2555648 204.3525 1.749485e-16 4.970156e-14 Arhgef10
Esrrg -4.322322 6.1235405 203.6277 1.850073e-16 5.068208e-14 Esrrg
Rnf141 -1.720475 4.3749706 193.0026 4.225180e-16 1.098619e-13 Rnf141
Pde4d -1.627661 5.0089214 188.8787 5.879873e-16 1.478740e-13 Pde4d
Crlf1 -3.428918 5.3052569 181.4193 1.093816e-15 2.663529e-13 Crlf1
  1. SL vs SM
# Upregulated genes were identified following the filtering parameters of logFC ≥ 1 and FDR < 0.05
upregulated_SL_vs_SM <- results_SL_vs_SM[
  results_SL_vs_SM$FDR < 0.05 & results_SL_vs_SM$logFC >= 1, ]

# Downregulated genes were identified following the filtering parameters of logFC ≤ 1 and FDR < 0.05
downregulated_SL_vs_SM <- results_SL_vs_SM[
  results_SL_vs_SM$FDR < 0.05 & results_SL_vs_SM$logFC <= -1, ]

# Top 20 upregulated genes were displayed

kable(head(upregulated_SL_vs_SM, 20), digits = 50, caption = "Top 20 Upregulated Genes: SL vs SM") %>%
  kable_styling("striped", full_width = FALSE) %>% scroll_box(width = "80%", height = "500px")
Top 20 Upregulated Genes: SL vs SM
logFC logCPM F PValue FDR Gene_names
Galnt4 4.157584 6.521874 1015.6554 3.893774e-28 5.973438e-24 Galnt4
Nkx2-3 6.850034 5.448621 910.6768 3.157096e-27 2.421650e-23 Nkx2-3
Casc4 5.082725 5.804160 843.9180 9.989003e-27 4.991333e-23 Casc4
Sox2 6.084940 4.965056 838.5557 1.301436e-26 4.991333e-23 Sox2
Vwf 4.118477 6.334989 803.0749 2.353816e-26 7.221979e-23 Vwf
Bace2 3.945412 5.950228 780.6508 3.852107e-26 9.849197e-23 Bace2
Syt7 6.328771 6.268689 770.2607 4.951868e-26 1.085237e-22 Syt7
Hpse2 6.418600 2.049215 588.9173 1.138432e-25 2.057243e-22 Hpse2
Galnt12 5.242333 6.494604 730.9068 1.206909e-25 2.057243e-22 Galnt12
Tmco3 3.554465 7.062339 712.4616 1.874623e-25 2.875859e-22 Tmco3
Isl2 6.382053 3.165308 659.0448 5.797170e-25 8.084945e-22 Isl2
Fgl2 4.696106 6.467270 652.5791 8.537817e-25 1.026436e-21 Fgl2
Dnajc10 4.780718 8.340781 651.8237 8.698039e-25 1.026436e-21 Dnajc10
Tgoln1 2.822574 7.501807 586.2337 5.372613e-24 5.887233e-21 Tgoln1
Bcan 6.328490 2.885365 556.0463 1.126810e-23 1.152426e-20 Bcan
Foxp2 5.290392 2.163991 559.4940 1.582571e-23 1.517389e-20 Foxp2
Tmc5 6.253124 6.156531 531.2174 2.980537e-23 2.689672e-20 Tmc5
B3gnt6 6.822907 7.325964 510.0197 5.822527e-23 4.962411e-20 B3gnt6
Kcne1 5.814247 6.287985 488.3988 1.222004e-22 9.866717e-20 Kcne1
Galnt6 4.243983 5.412865 484.9055 1.370746e-22 1.051430e-19 Galnt6
# Top 20 downregulated genes were displayed

kable(head(downregulated_SL_vs_SM, 20), digits = 50, caption = "Top 20 Downregulated Genes: SL vs SM") %>%
  kable_styling("striped", full_width = FALSE) %>% scroll_box(width = "80%", height = "500px")
Top 20 Downregulated Genes: SL vs SM
logFC logCPM F PValue FDR Gene_names
Srsf10 -1.351846 6.6333619 249.8339 7.460464e-18 2.043767e-15 Srsf10
Ttc7 -2.333207 5.8326110 215.9233 7.431902e-17 1.443200e-14 Ttc7
Ampd2 -1.713203 4.1361510 196.7664 3.138833e-16 5.234005e-14 Ampd2
Fam53b -1.483525 5.1604965 191.4929 4.762507e-16 7.455268e-14 Fam53b
Ttll12 -1.298845 3.5230415 159.1228 7.812595e-15 7.990201e-13 Ttll12
Slc25a12 -1.473749 5.7097686 154.9362 1.159215e-14 1.111470e-12 Slc25a12
Dleu7 -6.190975 -0.1335977 113.5056 1.422493e-14 1.330638e-12 Dleu7
Kcnj3 -3.295787 -0.6783567 109.4816 1.518714e-14 1.386821e-12 Kcnj3
Ero1lb -2.429578 6.6603718 151.3293 1.641210e-14 1.481047e-12 Ero1lb
Rcan2 -2.985086 2.5220846 142.2988 4.082663e-14 3.245189e-12 Rcan2
Mid2 -2.674937 3.7878651 138.9612 5.709815e-14 4.357924e-12 Mid2
Hepacam2 -2.558095 3.1546787 135.6365 8.124405e-14 5.851479e-12 Hepacam2
Agtpbp1 -1.330973 3.6246609 134.9155 8.744998e-14 6.239861e-12 Agtpbp1
Tfeb -1.416673 4.7292016 131.0261 1.330074e-13 9.068743e-12 Tfeb
Oit1 -3.093703 7.1624354 130.7650 1.368571e-13 9.168230e-12 Oit1
BC017612 -2.366891 6.8426447 130.2717 1.444579e-13 9.511283e-12 BC017612
Mpp6 -1.569008 4.8724220 126.2015 2.270727e-13 1.433548e-11 Mpp6
Derl3 -3.063186 7.5107406 125.2139 2.538426e-13 1.585946e-11 Derl3
Adrb1 -3.426786 5.0246723 124.5096 2.750620e-13 1.701503e-11 Adrb1
Adtrp -4.098484 4.3284122 122.5405 3.458436e-13 2.061027e-11 Adtrp

DEG counts:

Par vs SL: Number of Upregulated Genes: 2483

Par vs SL: Number of Downregulated Genes: 1454

Par vs SM: Number of Upregulated Genes: 2135

Par vs SM: Number of Downregulated Genes: 1027

SL vs SM: Number of Upregulated Genes: 1082

SL vs SM: Number of Downregulated Genes: 1082

8) Vizualization plots

  1. MA plots
  1. Par vs SL
# Defined thresholds for significance
fdr_threshold <- 0.05
logfc_threshold <- 1

# Selected the results for Par vs SL comparison
top_genes_Par_vs_SL <- results_Par_vs_SL

# Classified genes based on logFC and FDR
top_genes_Par_vs_SL$Significance <- "Not Significant"
top_genes_Par_vs_SL$Significance[top_genes_Par_vs_SL$logFC >= logfc_threshold & top_genes_Par_vs_SL$FDR < fdr_threshold] <- "Upregulated"
top_genes_Par_vs_SL$Significance[top_genes_Par_vs_SL$logFC <= -logfc_threshold & top_genes_Par_vs_SL$FDR < fdr_threshold] <- "Downregulated"

# Created an MA plot, colored the plot as per significance determined by the logfc and FDR thresholds
ggplot(top_genes_Par_vs_SL, aes(x = logCPM, y = logFC)) +
  geom_point(aes(color = Significance), alpha = 0.6, size = 1.5) +  
  scale_color_manual(values = c("Not Significant" = "grey", 
                                "Upregulated" = "purple", 
                                "Downregulated" = "pink")) +  # Custom colors for categories
  labs(title = "MA Plot - Par vs SM",
       x = "Average Log CPM",
       y = "Log Fold Change") +  
  theme_minimal() +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),  
    axis.title = element_text(size = 14),  
    legend.title = element_blank(),  
    legend.position = "top",  
    legend.text = element_text(size = 12)  
  ) + 
  geom_hline(yintercept = 0, linetype = "dashed", color = "black")  

  1. Volcano Plots
  1. Par_vs_SM
# Create the volcano plot with adjusted p-values (FDR) using EnhancedVolcano function from the EnhancedVolcano package
EnhancedVolcano(results_Par_vs_SM,
    lab = results_Par_vs_SM$Gene_names,                           # Labels for each gene
    x = 'logFC',                                                  # X-axis: log fold change
    y = 'FDR',                                                    # Y-axis: FDR (adjusted p-values)
    pCutoff = 0.05,                                               # Cutoff for FDR significance
    FCcutoff = 1,                                                 # Fold change cutoff
    title = 'Volcano Plot (Par vs  SM)',                           # Plot title
    xlim = c(-10, 10),                                            # Adjust logFC range if needed
    ylim = c(0, -log10(min(results_Par_vs_SM$FDR, na.rm=TRUE))),  # Adjust FDR y-axis
    pointSize = 1.0,                                              # Point size for each gene
    labSize = 3.0                                                 # Label size
)

  1. Heatmaps: Building heatmap based on FDR cut off 0.05
  1. SL vs SM
# Calculated normalized counts are stored in a variable normCounts
normCounts <- cpm(y, normalized.lib.sizes = TRUE, log = TRUE)

# From the topTags table, Filtered significant genes for SL vs SM comparison are stored in sigGenes_SL_vs_SM
sigGenes_SL_vs_SM <- topTags_SL_vs_SM$table
sigGenes_SL_vs_SM <- sigGenes_SL_vs_SM[sigGenes_SL_vs_SM$FDR < 0.05, ]
sigGenes_SL_vs_SM <- sigGenes_SL_vs_SM[order(sigGenes_SL_vs_SM$FDR), ]

# Selected relevant samples for the comparison
selected_samples_SL_vs_SM <- c("SL_1A_Mal",  "SL_1B_Mal",  "SL_2A_Mal",  "SL_2B_Mal",  "SL_3A_Mal",  "SL_3B_Mal", "SL_4A_Fem",  "SL_4B_Fem",  "SL_5A_Fem",  "SL_5B_Fem", "SL_6A_Fem",  "SL_6B_Fem","SM_1A_Mal",  "SM_1B_Mal",  "SM_2A_Mal",  "SM_2B_Mal",  "SM_3A_Mal",  "SM_3B_Mal", "SM_4A_Fem",  "SM_4B_Fem",  "SM_5A_Fem",  "SM_5B_Fem",  "SM_6A_Fem",  "SM_6B_Fem")

# Selecting our relevant samples from normCounts table and storing in another variable normCounts2
normCounts2 <- normCounts[, selected_samples_SL_vs_SM]

# Filtered to include only significant genes for SL_vs_SM
normCounts2 <- normCounts2[rownames(normCounts2) %in% rownames(sigGenes_SL_vs_SM), ]

# Selected top 50 significant genes for visualization
topSigGenes_SL_vs_SM <- head(normCounts2, 50)

# Defined strain annotation for SL and SM samples
strain_info_SL_vs_SM <- data.frame(
  Strain = c(rep("SL", 12), rep("SM", 12))
)

#Making the selected relevant samples as rownames
rownames(strain_info_SL_vs_SM) <- selected_samples_SL_vs_SM

# Defined colors for the strain annotation
strain_colors_SL_vs_SM <- list(Strain = c("SL" = "palegreen", "SM" = "paleturquoise1"))

# Generated the heatmap for the top significant genes for SL vs SM by using the function pheatmap from pheatmap package
pheatmap(topSigGenes_SL_vs_SM, scale = 'row', show_rownames = TRUE, treeheight_row = F, treeheight_col = 0, cluster_rows = TRUE, cluster_cols = FALSE, fontsize_row = 5, labels_col = selected_samples_SL_vs_SM, annotation_col = strain_info_SL_vs_SM, annotation_colors = strain_colors_SL_vs_SM, main = "Heatmap of SL vs SM (Top 50 genes)")

9) Pathway analysis

Gene Ontology: BP, CC, MF of Par vs SL

# Extracted the gene names from upregulated genes list and stored in a variable 'upregulated_genes_pathway_Par_vs_SL'
upregulated_genes_pathway_Par_vs_SL <- rownames(upregulated_Par_vs_SL)

# Extracted all genes tested in the differential expression analysis 
all_genes_pathway_Par_vs_SL <- rownames(results_Par_vs_SL)

# Convert gene symbols to Entrez IDs using org.Mm.eg.db
upregulated_entrez_Par_vs_SL <- bitr(upregulated_genes_pathway_Par_vs_SL, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID
all_entrez_Par_vs_SL <- bitr(all_genes_pathway_Par_vs_SL, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID

# Run GO enrichment analysis for each category (BP, CC, MF) by using enrichGO
ego_bp_Par_vs_SL <- enrichGO(
  gene          = upregulated_entrez_Par_vs_SL,
  universe      = all_entrez_Par_vs_SL,
  keyType       = "ENTREZID",
  OrgDb         = org.Mm.eg.db,
  ont           = "BP",
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

ego_cc_Par_vs_SL <- enrichGO(
  gene          = upregulated_entrez_Par_vs_SL,
  universe      = all_entrez_Par_vs_SL,
  keyType       = "ENTREZID",
  OrgDb         = org.Mm.eg.db,
  ont           = "CC",
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

ego_mf_Par_vs_SL <- enrichGO(
  gene          = upregulated_entrez_Par_vs_SL,
  universe      = all_entrez_Par_vs_SL,
  keyType       = "ENTREZID",
  OrgDb         = org.Mm.eg.db,
  ont           = "MF",
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

# Prepared the results for plotting by selecting top 15 terms for each category
top_bp_Par_vs_SL <- ego_bp_Par_vs_SL@result %>%
  arrange(p.adjust) %>%
  top_n(15, wt = -pvalue) %>%
  mutate(Category = "Biological Process", logPvalue = -log10(pvalue))

top_cc_Par_vs_SL <- ego_cc_Par_vs_SL@result %>%
  arrange(p.adjust) %>%
  top_n(15, wt = -pvalue) %>%
  mutate(Category = "Cellular Component", logPvalue = -log10(pvalue))

top_mf_Par_vs_SL <- ego_mf_Par_vs_SL@result %>%
  arrange(p.adjust) %>%
  top_n(15, wt = -pvalue) %>%
  mutate(Category = "Molecular Function", logPvalue = -log10(pvalue))

combined_results_Par_vs_SL <- bind_rows(top_bp_Par_vs_SL, top_cc_Par_vs_SL, top_mf_Par_vs_SL)

# Colorblind-friendly palette
cb_palette <- brewer.pal(n = 3, name = "Set2")

# Vizualized combined data in the form of barplots
ggplot(combined_results_Par_vs_SL, aes(x = reorder(Description, pvalue), y = logPvalue, fill = Category)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Category, scales = "free") +
  theme_minimal() +
  scale_fill_manual(values = cb_palette) +
  labs(
    x = "GO Term",
    y = "-log10(p-value)",
    title = "Top 15 Gene Function Classification (GO) for Upregulated Genes in Par vs SL"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1, size = 8),
    legend.title = element_blank(),
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  guides(fill = guide_legend(reverse = TRUE))

# Extracted the gene names from downregulated genes list and stored in a variable 'downregulated_genes_pathway_Par_vs_SL'
downregulated_genes_pathway_Par_vs_SL <- rownames(downregulated_Par_vs_SL)

# Convert gene symbols to Entrez IDs using org.Mm.eg.db
downregulated_entrez_Par_vs_SL <- bitr(downregulated_genes_pathway_Par_vs_SL, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID
all_entrez_Par_vs_SL <- bitr(all_genes_pathway_Par_vs_SL, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID

# Run GO enrichment analysis for each category (BP, CC, MF) by using enrichGO
ego_bp_down_Par_vs_SL <- enrichGO(
  gene          = downregulated_entrez_Par_vs_SL,
  universe      = all_entrez_Par_vs_SL,
  keyType       = "ENTREZID",
  OrgDb         = org.Mm.eg.db,
  ont           = "BP",
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

ego_cc_down_Par_vs_SL <- enrichGO(
  gene          = downregulated_entrez_Par_vs_SL,
  universe      = all_entrez_Par_vs_SL,
  keyType       = "ENTREZID",
  OrgDb         = org.Mm.eg.db,
  ont           = "CC",
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

ego_mf_down_Par_vs_SL <- enrichGO(
  gene          = downregulated_entrez_Par_vs_SL,
  universe      = all_entrez_Par_vs_SL,
  keyType       = "ENTREZID",
  OrgDb         = org.Mm.eg.db,
  ont           = "MF",
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

# Prepared the results for plotting
top_bp_down_Par_vs_SL <- ego_bp_down_Par_vs_SL@result %>%
  arrange(p.adjust) %>%
  top_n(15, wt = -pvalue) %>%
  mutate(Category = "Biological Process", logPvalue = -log10(pvalue))

top_cc_down_Par_vs_SL <- ego_cc_down_Par_vs_SL@result %>%
  arrange(p.adjust) %>%
  top_n(15, wt = -pvalue) %>%
  mutate(Category = "Cellular Component", logPvalue = -log10(pvalue))

top_mf_down_Par_vs_SL <- ego_mf_down_Par_vs_SL@result %>%
  arrange(p.adjust) %>%
  top_n(15, wt = -pvalue) %>%
  mutate(Category = "Molecular Function", logPvalue = -log10(pvalue))

combined_results_down_Par_vs_SL <- bind_rows(top_bp_down_Par_vs_SL, top_cc_down_Par_vs_SL, top_mf_down_Par_vs_SL)

# Vizualized combined data in the form of barplots
ggplot(combined_results_down_Par_vs_SL, aes(x = reorder(Description, pvalue), y = logPvalue, fill = Category)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Category, scales = "free") +
  theme_minimal() +
  scale_fill_manual(values = cb_palette) +
  labs(
    x = "GO Term",
    y = "-log10(p-value)",
    title = "Top 15 Gene Function Classification (GO) for Downregulated Genes in Par vs SL"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1, size = 8),
    legend.title = element_blank(),
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  guides(fill = guide_legend(reverse = TRUE))

KEGG Pathway: Par vs SM

# Extracted the gene names from upregulated genes list and stored in a variable 'upregulated_genes_pathway_Par_vs_SM'
upregulated_genes_pathway_Par_vs_SM <- rownames(upregulated_Par_vs_SM)

# Converted gene symbols to Entrez IDs using org.Mm.eg.db
upregulated_entrez_Par_vs_SM <- bitr(upregulated_genes_pathway_Par_vs_SM, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID

# Run KEGG pathway enrichment analysis using enrichKEGG
kegg_up_Par_vs_SM <- enrichKEGG(
  gene          = upregulated_entrez_Par_vs_SM,
  organism      = 'mmu',               # Mouse KEGG pathway
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05
)

# Created the dotplot
dotplot <- dotplot(kegg_up_Par_vs_SM , showCategory = 20) + ggtitle("KEGG: Par vs SM upregulated genes")

# Adjusted font sizes for better readability
dotplot + theme(
  axis.text.y = element_text(size = 6.5),  # Y-axis text size
)

# Extracted the gene names from downregulated genes list and stored in a variable 'downregulated_genes_pathway_Par_vs_SM'
downregulated_genes_pathway_Par_vs_SM <- rownames(downregulated_Par_vs_SM)

# Converted gene symbols to Entrez IDs using org.Mm.eg.db
downregulated_entrez_Par_vs_SM <- bitr(downregulated_genes_pathway_Par_vs_SM, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID

# Run KEGG pathway enrichment analysis using enrichKEGG
kegg_down_Par_vs_SM <- enrichKEGG(
  gene          = downregulated_entrez_Par_vs_SM,
  organism      = 'mmu',               # Mouse KEGG pathway (use 'hsa' for human)
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05
)

# Create the dotplot
dotplot1 <- dotplot(kegg_down_Par_vs_SM , showCategory = 20) + ggtitle("KEGG: Par vs SM Downregulated genes")

# Adjust font sizes using theme()
dotplot1 + theme(
  axis.text.y = element_text(size = 6.5),  # Y-axis text size
)

Reactome Pathway: SL vs SM

# Extracted the gene names from upregulated genes list and stored in a variable 'upregulated_genes_pathway_SL_vs_SM'
upregulated_genes_pathway_SL_vs_SM <- rownames(upregulated_SL_vs_SM)

# Converted gene symbols to Entrez IDs using org.Mm.eg.db
upregulated_entrez_SL_vs_SM <- bitr(upregulated_genes_pathway_SL_vs_SM, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID

# Performed reactome pathway enrichment analysis using enrichPathway
reactome_up_SL_vs_SM <- enrichPathway(
  gene          = upregulated_entrez_SL_vs_SM,
  organism      = 'mouse',             # Use 'human' for human data
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

# Calculated pairwise similarities between enriched terms
reactome_up_SL_vs_SM1 <- pairwise_termsim(reactome_up_SL_vs_SM)

# Created a tree plot to visualize the enrichment results
treeplot <- treeplot(reactome_up_SL_vs_SM1) + ggtitle("Reactome Pathway Enrichment (Upregulated Genes in SL vs SM)")

#Printing the plot
print(treeplot)

- SL vs SM: Downregulated Genes

# Extracted the gene names from downregulated genes list and stored in a variable 'downregulated_genes_pathway_SL_vs_SM'
downregulated_genes_pathway_SL_vs_SM <- rownames(downregulated_SL_vs_SM)

# Converted gene symbols to Entrez IDs using org.Mm.eg.db
downregulated_entrez_SL_vs_SM <- bitr(downregulated_genes_pathway_SL_vs_SM, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)$ENTREZID

# Run Reactome pathway enrichment analysis using enrichPathway
reactome_down_SL_vs_SM <- enrichPathway(
  gene          = downregulated_entrez_SL_vs_SM,
  organism      = 'mouse',             # Use 'human' for human data
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

# Calculated pairwise similarities between enriched terms
reactome_down_SL_vs_SM1 <- pairwise_termsim(reactome_down_SL_vs_SM)

# Created a tree plot to visualize the enrichment results
treeplot1 <- treeplot(reactome_down_SL_vs_SM1) + ggtitle("Reactome Pathway Enrichment (Downregulated Genes in SL vs SM)")

# Printing the plot
print(treeplot1)

10. Gene Set Enrichment Analysis

Preparing Gene Sets using Hallmark gene sets from MSigDB

# Retrieved Hallmark gene sets for Mus musculus
msig_hallmark <- msigdbr(species = "Mus musculus", category = "H")

# Prepared TERM2GENE as a data frame with two columns: "term" and "gene"
hallmark_gene_sets <- msig_hallmark %>%
  dplyr::select(gs_name, entrez_gene) %>%
  dplyr::rename(term = gs_name, gene = entrez_gene)

Generating Ranked Gene Lists

# Created a function to generate ranked gene lists
create_ranked_list <- function(de_results, gene_column = "Gene_names", logfc_column = "logFC") {
  # Ensured gene symbols are unique
  de_results <- de_results[!duplicated(de_results[[gene_column]]), ]
  
  # Removed any NA values
  de_results <- de_results[!is.na(de_results[[logfc_column]]), ]
  
  # Converted gene symbols to Entrez IDs
  gene_entrez <- bitr(de_results[[gene_column]],
                      fromType = "SYMBOL",
                      toType = "ENTREZID",
                      OrgDb = org.Mm.eg.db)
  
  # Merged with DE results
  de_results <- merge(de_results, gene_entrez, by.x = gene_column, by.y = "SYMBOL")
  
  # Removed any genes that failed to map
  de_results <- de_results[!is.na(de_results$ENTREZID), ]
  
  # Created a named vector for ranking
  ranked_list <- de_results$logFC
  names(ranked_list) <- de_results$ENTREZID
  
  # Removed duplicates and sort in decreasing order
  ranked_list <- ranked_list[!duplicated(names(ranked_list))]
  ranked_list <- sort(ranked_list, decreasing = TRUE)
  
  return(ranked_list)
}

Performing GSEA for Each Comparison

  1. GSEA: Par vs SL
# Created ranked list for Par_vs_SL
ranked_Par_vs_SL <- create_ranked_list(results_Par_vs_SL)

# Run GSEA using clusterProfiler's GSEA function
gsea_Par_vs_SL <- GSEA(
  ranked_Par_vs_SL,
  TERM2GENE = hallmark_gene_sets,
  pvalueCutoff = 0.05,
  eps = 0,            # Set eps to zero for better estimation
  verbose = FALSE
)

# Extracted pathways with P-value == 1e-10 (extremely significant)
extremely_significant_Par_vs_SL <- gsea_Par_vs_SL@result %>%
  filter(pvalue == 1e-10)

# Converted GSEA results to a data frame
gsea_results_df_Par_vs_SL <- as.data.frame(gsea_Par_vs_SL)

# Combineed top 15 pathways with extremely significant ones
combined_top_Par_vs_SL <- gsea_results_df_Par_vs_SL %>%
  arrange(p.adjust) %>%
  head(15) %>%
  bind_rows(extremely_significant_Par_vs_SL)

# Removed potential duplicates based on Description
combined_top_Par_vs_SL <- combined_top_Par_vs_SL %>%
  distinct(Description, .keep_all = TRUE)

# Created a bar plot for Par_vs_SL
ggplot(combined_top_Par_vs_SL, aes(x = reorder(Description, NES), y = NES, fill = NES)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_gradient(low = "lightblue", high = "lightcoral") +
  theme_minimal() +
  labs(title = "Top Enriched Gene Sets - Par_vs_SL",
       x = "Gene Set",
       y = "Normalized Enrichment Score (NES)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.text = element_text(size = 12)
  )

# Displayed the top 20 GSEA results in a scrollable table
kable(head(gsea_Par_vs_SL, 20), digits=50) %>% kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "100%", height = "500px")
ID Description setSize enrichmentScore NES pvalue p.adjust qvalue rank leading_edge core_enrichment
HALLMARK_ALLOGRAFT_REJECTION HALLMARK_ALLOGRAFT_REJECTION HALLMARK_ALLOGRAFT_REJECTION 172 0.7177226 2.249958 1.004867e-19 5.024334e-18 3.490590e-18 1760 tags=51%, list=13%, signal=45% 12525/12526/12518/15002/12500/12502/12503/16428/12501/16818/15001/12504/84544/66211/20299/55985/50931/12481/22637/20304/16186/12487/16678/21939/20849/16170/17329/23871/19264/26411/12516/16994/14423/17395/50723/22376/14537/16185/16408/18646/16643/16364/21838/16822/19073/56501/12766/12524/12010/17972/16149/14960/20452/12479/12519/18751/16174/16161/226419/15163/14938/16190/21897/16176/14191/16196/17076/79221/12703/12263/15170/16184/18636/16414/12444/21354/17084/15007/16173/12035/14130/17086/14998/14102/15976/14744/15900/21803
HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE 187 0.6304543 1.989881 4.685858e-12 1.171464e-10 8.138595e-11 2306 tags=48%, list=16%, signal=41% 71939/242248/14990/110168/12515/20304/12984/16421/21939/20849/12265/17329/76074/434341/17472/16185/16364/236573/16822/60533/108670/12524/12010/16149/22329/626578/209086/14960/20452/14969/12363/12702/16154/16913/14938/16190/16196/75345/246727/18578/27056/12703/15170/20344/16912/12369/15945/12258/229003/21928/27007/21354/14962/15007/11861/109032/66892/19225/14998/14102/246728/70082/15976/15900/667277/21930/12362/18712/55932/14964/58203/56791/65972/234311/20306/15894/56045/14528/22035/16963/15959/21929/74481/22271/54123/12575/19171/20296/15958/209588
HALLMARK_MYOGENESIS HALLMARK_MYOGENESIS HALLMARK_MYOGENESIS 176 0.6055251 1.904287 1.692797e-09 2.821328e-08 1.960081e-08 1982 tags=37%, list=14%, signal=32% 17879/19293/21957/17884/76722/21925/17901/59011/12715/11537/17930/11474/14077/11459/11937/21953/17907/15464/12862/12372/21393/13346/11472/20190/19309/17189/56012/24053/24131/14199/13009/20856/14778/20249/17929/24052/17896/11489/16511/16985/16404/12865/13808/12491/17260/23796/19245/56188/29817/17873/11568/16009/18563/18128/11636/16000/21803/12842/226251/12759/17882/11423/69253/52250/227753
HALLMARK_PROTEIN_SECRETION HALLMARK_PROTEIN_SECRETION HALLMARK_PROTEIN_SECRETION 94 -0.5899617 -2.255769 1.941524e-08 2.426906e-07 1.686061e-07 3709 tags=64%, list=26%, signal=47% 16561/22319/56418/211673/211286/15893/71770/20844/56041/11774/107338/22088/11487/20479/17113/99371/100226/67830/22365/72736/271457/59042/108664/208449/99889/12757/213827/14420/16784/11840/19334/59021/11765/67074/56334/54214/20333/320634/11605/67300/20619/16004/107767/76332/26951/12512/69162/70349/20404/53331/12725/11977/50797/16668/21985/69608/77929/70361/18484/71943
HALLMARK_INFLAMMATORY_RESPONSE HALLMARK_INFLAMMATORY_RESPONSE HALLMARK_INFLAMMATORY_RESPONSE 171 0.5631777 1.765002 1.018982e-06 1.018982e-05 7.079240e-06 2636 tags=46%, list=19%, signal=38% 17167/12775/20343/16818/20299/12515/20304/56696/21939/16197/17329/269346/19419/68713/321019/50723/21942/80901/16185/16182/16822/54483/18413/20339/16174/56221/54371/19734/16154/11988/12506/16190/21897/16176/16992/23796/18578/320207/21338/50778/50498/15117/19217/12986/15945/14676/21938/23845/19219/20511/16173/13732/17869/21930/54598/380713/15465/20306/15894/14528/22035/13058/13136/11303/14723/54123/16878/12575/56212/20296/16323/18414/20354/216799/20266/69550/54216/64008/11541
HALLMARK_ANDROGEN_RESPONSE HALLMARK_ANDROGEN_RESPONSE HALLMARK_ANDROGEN_RESPONSE 107 -0.4800640 -1.875840 1.721447e-05 1.434539e-04 9.966273e-05 2403 tags=42%, list=17%, signal=35% 109711/56228/74754/20393/56847/56376/20229/229055/319554/12007/71817/21807/66884/53416/15357/17761/17289/16691/19341/107652/16410/13714/21985/14595/69608/70361/231070/109785/13521/54608/74205/16617/16613/16623/16622/16615/18048/16619/12167/18050/16616/13648/13646/16624/30051
HALLMARK_IL6_JAK_STAT3_SIGNALING HALLMARK_IL6_JAK_STAT3_SIGNALING HALLMARK_IL6_JAK_STAT3_SIGNALING 80 0.5908909 1.732533 2.274794e-04 1.421746e-03 9.877396e-04 2540 tags=45%, list=18%, signal=37% 55985/12984/16186/16199/17329/16994/16182/14825/12491/12702/16161/16190/16176/16196/16178/320207/12703/50498/16184/12986/15945/21938/12804/14102/21803/18712/20306/16172/11482/56744/18414/12494/20846/54635/16401/16188
HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_TNFA_SIGNALING_VIA_NFKB 183 0.4990590 1.574185 2.165493e-04 1.421746e-03 9.877396e-04 2439 tags=37%, list=17%, signal=31% 13537/12515/20304/13655/16197/22029/12522/16598/14282/321019/50723/21942/20621/20527/18037/170768/14825/19288/12519/17681/55991/12702/12044/12156/70789/16176/13654/18578/14281/56193/227659/13653/19696/15945/21928/14373/21354/19219/17873/16173/56336/15939/56708/19698/16449/19225/17869/21930/106869/11796/18124/15894/14528/21664/104681/21929/11303/16878/12575/14579/20296/15958/16323/99929/240672/57783/75234
HALLMARK_KRAS_SIGNALING_UP HALLMARK_KRAS_SIGNALING_UP HALLMARK_KRAS_SIGNALING_UP 181 0.5044412 1.588134 2.564028e-04 1.424460e-03 9.896247e-04 2464 tags=41%, list=18%, signal=34% 12683/57264/12493/16186/22778/16625/16197/12767/23871/22029/26411/57875/67468/19699/17395/20440/20255/94176/16792/18613/71683/19734/21788/16154/18671/16913/12156/23882/16176/67888/79221/18080/19092/66066/19279/15945/14373/16414/12444/21938/84094/12349/14962/20394/16009/16574/19225/11732/15900/56743/17385/93695/15483/74145/19662/11796/12142/104156/21929/14396/14257/13040/16878/13805/16398/83397/72309/18186/16323/18933/73149/16534/20266/77125
HALLMARK_COMPLEMENT HALLMARK_COMPLEMENT HALLMARK_COMPLEMENT 179 0.4985989 1.569030 3.341847e-04 1.670923e-03 1.160852e-03 3115 tags=44%, list=22%, signal=34% 12902/16818/11472/20304/27226/19419/22376/210293/16822/20255/14360/14825/14710/14462/12363/12491/22762/12266/14938/70789/80906/110197/13036/14069/320207/70574/14067/12263/56193/14702/16912/14696/12369/12258/21417/238130/12349/14962/17436/12569/667277/12362/18712/16923/12759/14268/380921/20701/11812/18132/13032/13136/21929/14723/13040/54123/21789/11808/56212/72269/12259/240672/13033/14939/20196/12262/12608/208650/17096/16612/13660/71753/14571/14678/14061/22074/15378/27361
HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS 192 0.4900602 1.546815 4.935926e-04 2.243603e-03 1.558714e-03 4439 tags=50%, list=32%, signal=35% 19139/20135/21973/11799/17345/76131/218977/26934/12442/229841/70099/73804/20877/66570/12447/21853/16571/72391/12580/17121/17215/15366/67629/17869/66929/105988/17218/107995/54141/52276/12236/14793/70218/110033/18817/108961/22042/15201/53605/13178/16906/17089/12575/12704/18969/18973/19362/69270/67196/16765/67241/12021/22154/54124/12534/14056/17219/72107/17217/21877/69639/15361/22059/20937/19183/17216/12190/98386/53892/114714/50927/60364/17220/12531/66422/17279/97165/17865/11991/66197/66403/17768/18813/103733/78833/67134/18971/66442/30939/16647/13006/100336/69716/110052/100710/21335
HALLMARK_IL2_STAT5_SIGNALING HALLMARK_IL2_STAT5_SIGNALING HALLMARK_IL2_STAT5_SIGNALING 182 0.4669871 1.471813 2.235579e-03 9.314911e-03 6.471412e-03 2253 tags=35%, list=16%, signal=29% 20343/15985/54167/74734/13011/22029/12522/16407/16994/21942/16185/20527/16364/16182/384009/13813/12524/13808/19734/19228/22163/16154/18671/12156/12506/16190/14256/16178/18550/381319/12703/11492/100604/20344/19217/16184/15945/12444/21938/12349/18619/12447/11861/17873/12043/23834/14064/17988/20947/14744/170743/17869/15900/214547/18712/55932/21936/22035/21664/13601/11596/16878/17760
HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT 186 0.4506709 1.421167 6.322301e-03 2.431654e-02 1.689360e-02 3770 tags=43%, list=27%, signal=32% 12615/21973/11799/68612/17345/72119/16551/12235/71819/108907/26934/12442/78733/229841/70099/73804/20877/67052/242705/16571/12545/18005/72391/12580/23834/17215/12428/15366/17536/17869/105988/17218/108000/107995/21803/209737/12449/22137/110033/18817/67141/16906/52033/13555/19366/18969/18973/240641/16765/20539/12021/26909/54124/12534/233406/14056/22036/16319/73086/17219/60406/15361/20937/17216/12190/98386/11987/12443/50927/12531/17765/27221/17118/19650/238247/12544/17865/27214/11991/66197
HALLMARK_ADIPOGENESIS HALLMARK_ADIPOGENESIS HALLMARK_ADIPOGENESIS 192 0.4484965 1.415624 7.816571e-03 2.791632e-02 1.939450e-02 3818 tags=38%, list=27%, signal=28% 12683/11450/57264/11770/16846/77037/19016/19218/246747/57875/14778/13120/18606/17001/170768/16404/12491/12266/20509/16890/16956/20618/18405/17436/11816/20887/11861/12580/52538/170439/67834/66205/103172/11303/13350/27376/51798/23945/11364/14085/13850/109019/13602/14792/209378/21881/15979/12908/14571/68349/20778/66445/20850/27047/15929/26379/12896/11429/74185/11363/235339/110826/13476/13830/67460/22628/13382/15107/12053/27395/66142/66447/12974
HALLMARK_APICAL_JUNCTION HALLMARK_APICAL_JUNCTION HALLMARK_APICAL_JUNCTION 172 0.4465108 1.399748 1.194616e-02 3.982052e-02 2.766478e-02 2527 tags=31%, list=18%, signal=26% 69165/11474/11459/18175/11472/23912/18417/19354/22029/19264/14086/12741/15896/17395/21838/14026/60533/16511/12524/22329/320840/269116/23792/18613/20963/83964/60363/68794/54419/17698/73341/94332/15898/239857/21810/23797/13507/12798/12490/26412/245537/12805/20970/11548/15894/104099/16398/17390/105827/232975/23794/64540/16367
HALLMARK_APOPTOSIS HALLMARK_APOPTOSIS HALLMARK_APOPTOSIS 153 0.4426968 1.378980 1.596565e-02 4.989267e-02 3.466228e-02 2425 tags=27%, list=17%, signal=22% 16842/12481/12515/13655/14778/18796/380684/18646/58801/12363/12156/16176/21973/12257/12122/12369/16012/12444/14676/20515/21354/17873/235180/16173/14102/24014/11732/12362/12759/12389/13807/11796/227753/14528/22035/19401/13179/12575/17390/21814/12494
  1. GSEA: Par vs SM
# Created ranked list for Par_vs_SM
ranked_Par_vs_SM <- create_ranked_list(results_Par_vs_SM)

# Run GSEA
gsea_Par_vs_SM <- GSEA(
  ranked_Par_vs_SM,
  TERM2GENE = hallmark_gene_sets,
  pvalueCutoff = 0.05,
  eps = 0,            # Set eps to zero for better estimation
  verbose = FALSE
)

# Extracted pathways with P-value == 1e-10 (extremely significant)
extremely_significant_Par_vs_SM <- gsea_Par_vs_SM@result %>%
  filter(pvalue == 1e-10)

# Converted GSEA results to a data frame
gsea_results_df_Par_vs_SM <- as.data.frame(gsea_Par_vs_SM)

# Combined top 15 pathways with extremely significant ones
combined_top_Par_vs_SM <- gsea_results_df_Par_vs_SM %>%
  arrange(p.adjust) %>%
  head(15) %>%
  bind_rows(extremely_significant_Par_vs_SM)

# Removed potential duplicates based on Description
combined_top_Par_vs_SM <- combined_top_Par_vs_SM %>%
  distinct(Description, .keep_all = TRUE)

# Created a bar plot for Par_vs_SM
ggplot(combined_top_Par_vs_SM, aes(x = reorder(Description, NES), y = NES, fill = NES)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_gradient(low = "lightblue", high = "lightcoral") +
  theme_minimal() +
  labs(title = "Top Enriched Gene Sets - Par_vs_SM",
       x = "Gene Set",
       y = "Normalized Enrichment Score (NES)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.text = element_text(size = 12)
  )

# Displayed the top 20 GSEA results in a scrollable table
kable(head(gsea_Par_vs_SM, 20), digits=50) %>% kable_styling("striped", full_width = FALSE, position = "center") %>% scroll_box(width = "100%", height = "500px")
ID Description setSize enrichmentScore NES pvalue p.adjust qvalue rank leading_edge core_enrichment
HALLMARK_ALLOGRAFT_REJECTION HALLMARK_ALLOGRAFT_REJECTION HALLMARK_ALLOGRAFT_REJECTION 172 0.7542856 2.211116 9.858377e-22 4.929189e-20 3.216944e-20 2263 tags=62%, list=16%, signal=52% 12526/16678/12500/12518/12525/12502/16428/15002/12501/12503/55985/12504/16818/16994/12481/50931/20299/84544/16186/22637/21939/12487/20304/66211/26411/20849/16364/16170/19264/17329/23871/22376/21838/12516/17076/15001/12519/19073/17395/16161/16408/56501/18646/20306/12703/17972/16185/12766/20296/20452/16643/16414/16190/12524/15163/18751/16149/14423/14960/50723/14991/12444/15894/16173/21926/18636/14998/16878/16174/226419/14102/16196/15170/79221/14999/14938/12010/14191/19171/14469/16184/16176/12051/216799/21354/21803/15976/12479/15900/13040/16822/17084/12263/17086/11423/80719/12189/56744/20846/12772/14744/14130/20085/15007/15979/21355
HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE 187 0.6716847 1.978911 9.386127e-14 2.346532e-12 1.531421e-12 2654 tags=55%, list=19%, signal=45% 242248/71939/14990/110168/12984/12515/21939/16421/20304/20849/16364/17329/12265/76074/60533/75345/20306/12703/16185/22329/20296/108670/236573/20452/17472/16190/58203/246727/12363/12702/12524/209086/16149/14969/14960/21929/14991/15894/27056/626578/16912/14998/16154/12362/12494/14962/14102/16196/15170/16913/55932/12226/14938/15945/12010/56791/19171/12258/18712/209588/70082/12575/21354/22035/15976/20344/15959/27007/15900/434341/234311/246728/16822/547253/15958/21928/11861/20846/67138/667277/192786/69550/66892/65972/15007/14964/14190/56045/107607/67775/59027/229003/217069/19225/16362/57444/20821/54123/18035/327959/24110/19039
HALLMARK_ANDROGEN_RESPONSE HALLMARK_ANDROGEN_RESPONSE HALLMARK_ANDROGEN_RESPONSE 107 -0.6667696 -2.405666 4.014171e-12 6.690285e-11 4.366291e-11 787 tags=24%, list=6%, signal=23% 14229/30051/17132/328365/544963/74205/53416/21985/17761/13521/16612/21807/56847/20393/16613/16622/16617/16623/16615/18048/16619/13648/13646/16616/18050/16624
HALLMARK_MYOGENESIS HALLMARK_MYOGENESIS HALLMARK_MYOGENESIS 176 0.6268406 1.842645 5.253310e-09 6.566637e-08 4.285595e-08 2181 tags=40%, list=16%, signal=34% 17879/19293/11459/11474/21925/59011/21957/14077/17884/21953/17930/17901/17907/11537/20190/11937/76722/12862/21393/12715/15464/17189/12372/13346/11472/24131/20856/19309/20249/12491/56012/16985/14199/20657/14778/11489/17896/24052/17882/16511/13009/13808/17929/16404/12759/21955/17260/19245/56188/16000/12575/17873/12865/21803/12323/12554/23796/58226/16009/29817/11423/52250/18767/18563/69253/11568/12825/12017/14081/227753
HALLMARK_INFLAMMATORY_RESPONSE HALLMARK_INFLAMMATORY_RESPONSE HALLMARK_INFLAMMATORY_RESPONSE 171 0.6162798 1.805316 1.634422e-08 1.634422e-07 1.066675e-07 2323 tags=46%, list=17%, signal=39% 17167/12775/20343/16818/20299/16992/16197/12515/21939/56696/54483/20304/326623/269346/18413/17329/50778/80901/68713/233079/13136/20306/12506/21942/16185/20296/321019/16190/19217/19219/18793/50723/56221/15894/320207/20288/16173/21938/13732/16182/16878/16154/16174/54371/19734/12986/15945/11303/14676/54598/21950/19204/16176/12575/216799/22035/50498/14734/20339/23796/13058/15465/17387/19419/16822/16402/16175/54216/56212/20354/23845/17869/69550/16416/18414/67775/17096/59027
HALLMARK_IL6_JAK_STAT3_SIGNALING HALLMARK_IL6_JAK_STAT3_SIGNALING HALLMARK_IL6_JAK_STAT3_SIGNALING 80 0.6641959 1.825662 2.862486e-06 2.385405e-05 1.556791e-05 2261 tags=46%, list=16%, signal=39% 55985/16994/16186/12984/16199/14825/17329/12491/16401/16847/16161/16178/20306/12703/16190/12702/320207/21926/21938/16182/12494/14102/16196/12986/15945/16172/16184/18712/16176/21803/50498/16188/56744/20846/16416/15979/18414
HALLMARK_COMPLEMENT HALLMARK_COMPLEMENT HALLMARK_COMPLEMENT 179 0.5618742 1.651029 1.179749e-05 8.426777e-05 5.499581e-05 2810 tags=40%, list=20%, signal=33% 12902/17002/16818/14462/11472/20304/14825/12491/27226/22376/70574/13136/210293/17436/72461/14069/18793/12363/56193/21929/20255/12266/14696/12759/80906/14360/320207/14710/16912/12362/14962/14702/22762/14938/70789/12569/14268/21417/12258/18712/14067/13039/12554/16889/13040/17387/19419/16822/229445/13036/12263/14678/56212/667277/21789/18643/14571/17096/13032/13030/16362/14066/19141/54123/30955/380921/12262/16923/240672/12259/18783/13660
HALLMARK_PROTEIN_SECRETION HALLMARK_PROTEIN_SECRETION HALLMARK_PROTEIN_SECRETION 94 -0.5236179 -1.873083 1.544893e-05 9.655582e-05 6.301538e-05 3203 tags=51%, list=23%, signal=40% 20479/20333/56334/59042/11840/74006/17113/211673/20844/108123/213827/320634/22088/66251/70349/12068/59021/67074/67300/16004/11765/50797/12725/11977/54214/107338/69162/271457/16561/16668/69608/108664/56041/15893/20619/70361/107767/71943/18484/11928/99371/77929/20404/21985/110935/53331/14420/216350
HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS 192 0.5475710 1.614163 4.564068e-05 2.535593e-04 1.654808e-04 4112 tags=52%, list=29%, signal=37% 76131/72391/11799/70099/66929/73804/20135/20877/12442/107995/17121/21973/18817/17345/70218/17215/17218/26934/12447/15366/13178/108961/218977/229841/110033/12580/12575/52276/67629/16906/14793/12534/16571/12531/17279/66570/19362/15201/54141/17219/18973/12236/20937/21853/97165/12189/17217/19139/67196/17869/98386/16765/69270/18969/22154/15361/17865/20878/21335/72107/114714/69639/21877/78833/20873/12021/17220/12704/60364/22059/56150/68219/53381/19183/17768/66131/17089/14056/16881/67134/18538/53605/19076/66403/19891/110052/12190/100710/19718/53892/100336/103573/16647/19384/69716/212377/50927/19355/18971
HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE 89 0.6211967 1.723823 6.359629e-05 3.179814e-04 2.075247e-04 3617 tags=61%, list=26%, signal=45% 20343/68713/108670/16190/209086/16149/12799/16912/12362/16196/16913/55932/15945/12010/56791/14469/21354/15959/234311/246730/547253/15958/74153/67138/69550/65972/211329/15007/67775/229003/217069/16362/57444/20821/54123/24110/19039/67168/22169/74481/23960/19188/246696/319236/17069/66355/19124/66141/12370/17858/16363/16423/100038882/72075
HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_TNFA_SIGNALING_VIA_NFKB 183 0.5153072 1.514859 5.742110e-04 2.610050e-03 1.703401e-03 1681 tags=31%, list=12%, signal=27% 13537/16197/12515/20304/14825/55991/20621/13655/22029/12522/16598/20527/12519/18037/19288/170768/21942/20296/321019/19219/18793/19696/12702/56193/21929/50723/56708/15894/16173/21926/21923/13654/16878/17681/71712/56336/14282/227659/12044/12226/19698/12156/70789/15945/11303/21950/16176/14281/12575/12051/15939/14579/17873/21354/57783/106869
HALLMARK_IL2_STAT5_SIGNALING HALLMARK_IL2_STAT5_SIGNALING HALLMARK_IL2_STAT5_SIGNALING 182 0.5119814 1.506545 6.493635e-04 2.705681e-03 1.765813e-03 2255 tags=37%, list=16%, signal=32% 20343/15985/16994/54167/74734/16364/13011/384009/22029/12522/16407/20527/16178/22163/12506/12703/21942/16185/13808/16190/19217/13813/12524/14256/100604/12444/21938/16182/16878/11520/16154/12043/19228/11492/19734/67547/381319/20947/55932/12156/15945/18619/18671/12447/170743/21936/16184/53314/18712/14085/22781/17873/22035/20344/16188/18550/23834/15900/17988/64138/72729/11861/13601/17869/229521/14744/14190/15979
HALLMARK_ADIPOGENESIS HALLMARK_ADIPOGENESIS HALLMARK_ADIPOGENESIS 192 0.5014731 1.478273 1.496920e-03 5.757386e-03 3.757452e-03 2286 tags=22%, list=16%, signal=19% 12683/246747/11450/57264/11770/77037/16846/19016/12491/19218/13120/57875/14778/17436/170768/16956/18606/17001/16404/12266/11520/13830/11303/16890/23943/12580/14085/20618/16775/13358/170439/52538/11861/11816/67460/18405/18569/11933/15979/15926/66205/14571
HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT 186 0.4878541 1.439638 4.009933e-03 1.432119e-02 9.346461e-03 2886 tags=38%, list=21%, signal=30% 242705/12615/22137/72391/11799/70099/12235/73804/71819/20877/12442/16551/107995/26909/72119/21973/18817/18005/68612/17345/12428/17215/108907/17218/26934/15366/229841/110033/12580/67052/12051/21803/16906/240641/12534/16571/12531/23834/12449/108000/17219/18973/52033/78733/20937/67141/209737/17869/22036/98386/12545/16765/18969/16319/27214/20539/15361/17865/20878/21335/107503/17118/14211/20873/12021/56150/17536/233406/19650/20460
HALLMARK_KRAS_SIGNALING_UP HALLMARK_KRAS_SIGNALING_UP HALLMARK_KRAS_SIGNALING_UP 181 0.4858889 1.428489 5.523955e-03 1.841318e-02 1.201703e-02 2617 tags=39%, list=19%, signal=32% 12683/57264/12493/16186/16197/12767/22778/20568/26411/23871/22029/57875/94176/67468/16625/17395/93695/20440/16414/16792/18793/20394/71683/21929/20255/12444/21788/11501/21938/18613/16878/16154/14962/56743/19734/16913/58860/79221/11421/19699/12156/15945/18671/14396/16176/67888/14257/18703/15900/13040/73149/66066/16009/12142/20716/20616/11846/14063/14009/170744/74145/18933/19092/72309/11732/17385/19225/11796/20230/84094/83397
  1. GSEA: SL vs SM
# Created ranked list for SL_vs_SM
ranked_SL_vs_SM <- create_ranked_list(results_SL_vs_SM)

# Run GSEA
gsea_SL_vs_SM <- GSEA(
  ranked_SL_vs_SM,
  TERM2GENE = hallmark_gene_sets,
  pvalueCutoff = 0.05,
  verbose = FALSE
)

#Skipping extremely significant step as there are very few enrichments in this group

#Plotting enrichment score for Hallmark Androgen Response
gseaplot2(gsea_SL_vs_SM, geneSetID = "HALLMARK_ANDROGEN_RESPONSE", title = "HALLMARK_ANDROGEN_RESPONSE")

END OF REPORT